* Reset settings and initialize log file
* NOTE(review): `launch` is not defined in this file and is presumably a
* project-provided program; it is paired with the `unlaunch` call that ends this
* script. The path ends in "cps_bms_sampler" while the datasets saved below are
* named "cps_bms_sample" -- confirm that this is the intended script/log name.
launch, path("build/cps_bms_sampler")

*-------------------------------------------------------------------------------
* Price and Wasserman (2024), "The Summer Drop in Female Employment"
*
* Description: Construct the main CPS sample.
*-------------------------------------------------------------------------------


* Count the weeks separating consecutive CPS reference weeks (BLS calendar rules)
*-------------------------------------------------------------------------------

* One row per month from December 1988 through December 2019; the leading month
* exists only so that January 1989 has a predecessor to difference against
clear
local n_months = 12 * (2019 - 1989 + 1) + 1
set obs `n_months'
gen int tm = ym(1988, 12) + _n - 1

* Calendar landmarks, all anchored on the first day of the month
gen int first_of_month = dofm(tm)
gen byte month = month(first_of_month)
gen int fifth_of_month = first_of_month + 4
gen int twelfth_of_month = first_of_month + 11

* Default rule: the reference week is the week holding the 12th of the month.
* ref_date stores the Saturday closing that week (dow() equals 0 on Sundays).
gen int ref_date = twelfth_of_month + (6 - dow(twelfth_of_month))
format ref_date %td

* Day-of-month of the fourth Thursday (this is Thanksgiving when month == 11)
gen byte first_thursday = 1 + mod(4 - dow(first_of_month), 7)
gen byte thanksgiving = first_thursday + 21
assert inrange(thanksgiving, 22, 28)

* Holiday exceptions, each moving the reference week one week earlier:
*   - November, when Thanksgiving lands in the week containing the 19th
*     (i.e., Thanksgiving is on the 22nd or 23rd)
*   - December, when the week containing the 5th lies entirely within December
*     (i.e., the 5th falls on a Sunday through Thursday)
gen byte shift_early = (month == 11 & inlist(thanksgiving, 22, 23)) | ///
	(month == 12 & inrange(dow(fifth_of_month), 0, 4))
replace ref_date = ref_date - 7 if shift_early

* Reduce to the month -> reference date mapping
keep tm ref_date

* Gap to the previous month's reference date, which must be a whole number of weeks
sort tm
gen int gap_days = ref_date - ref_date[_n - 1]
assert mod(gap_days, 7) == 0 if _n > 1
gen byte weeks = round(gap_days / 7)

* Only the leading (December 1988) row lacks a predecessor; discard it
assert _n == 1 if missing(weeks)
drop if missing(weeks)

* Hold the result in a temporary file for the merge further below
keep tm weeks
tempfile weeks
save `weeks'


* Summarize, for each parent, the own children under 18 living in the household
*-------------------------------------------------------------------------------

* Load child records together with their pointers to co-resident parents
gzuse hid pernum tm age momloc poploc momloc2 poploc2 using "$basepath/data/derived/cps_bms_children.dta.gz", clear
rename pernum child_pernum

* A second mother/father pointer may be filled only when the first one is
foreach p in mom pop {
	assert `p'loc2 == 0 if `p'loc == 0
}

* Stack the four pointers so that each row is one child x parent slot
* (parent_type: 1 = momloc, 2 = poploc, 3 = momloc2, 4 = poploc2)
rename (momloc poploc momloc2 poploc2) (parent_pernum1 parent_pernum2 parent_pernum3 parent_pernum4)
reshape long parent_pernum, i(hid child_pernum tm) j(parent_type)

* A pointer equal to 0 carries no parent link
drop if parent_pernum == 0

* Verify that a given parent is never listed as both a mother and a father.
* NOTE(review): the check requires the same slot (1-4) across all of a parent's
* children, which is stricter than mother-vs-father alone.
bysort hid parent_pernum tm (parent_type): assert parent_type[1] == parent_type[_N]
drop parent_type

* Indicator for each single year of child age, to be summed within parent
forvalues a = 0/17 {
	gen byte ownkids_age`a' = (age == `a')
}

* Collapse to one row per parent x month: number of children, number at each
* age, and the age of the youngest child
rename parent_pernum pernum
gcollapse (count) ownkids = child_pernum ///
	(sum) ownkids_age* ///
	(min) youngest_years = age, ///
	by(hid pernum tm) sumcheck
compress

* Hold the parent-level child counts in a temporary file for merging
tempfile ownkids
save `ownkids'


* Assemble person-level variables for adult respondents
*-------------------------------------------------------------------------------

* Read the listed variables from the cleaned CPS basic monthly file
gzuse pid hid tm pernum mish wtfinl marbasecidp linked_monthly linked_complete ///
	female age wbho educ marstat state_fips school_status ///
	emp unemp nlf ftemp ptemp absent empstat occ1990 occ1990_2d ind1990 ///
	ahrsworkt whyunemp whynilf whyabsnt earnweek earnwt paid_absence ///
	selfemp hourly hourwage uhrsworkt sploc ///
	using "$basepath/data/derived/cps_bms.dta.gz", clear

* Split the monthly date into calendar year and month
gen int year = year(dofm(tm))
gen byte month = month(dofm(tm))

* Attach month names (taken from c(Months)) as value labels
capture label drop month_lbl
local m = 0
foreach mname in `c(Months)' {
	local ++m
	label define month_lbl `m' "`mname'", add
}
label values month month_lbl

* Labor force participation is the complement of not-in-labor-force
gen byte lfp = 1 - nlf

* Five-year age bins covering ages 25-49; left missing outside that range
gen byte age_bin = .
forvalues b = 1/5 {
	local lo = 20 + 5 * `b'
	local hi = `lo' + 4
	replace age_bin = `b' if inrange(age, `lo', `hi')
}

label define age_bin_lbl 1 "Age 25-29" 2 "Age 30-34" 3 "Age 35-39" 4 "Age 40-44" 5 "Age 45-49", replace
label values age_bin age_bin_lbl

* Actual hours: missing values are expected only for empstat codes 12-36
* (NOTE(review): presumably the not-at-work statuses -- confirm against the
* codebook) and are set to zero; employed people with zero hours must be absent
assert inrange(empstat, 12, 36) if missing(ahrsworkt)
rename ahrsworkt hours
replace hours = 0 if missing(hours)
assert absent == 1 if emp == 1 & hours == 0

* Linear spline in calendar time (tm) with six knots, giving seven segments
local k1 = ym(1993,  2)
local k2 = ym(2000,  4)
local k3 = ym(2003, 10)
local k4 = ym(2007,  1)
local k5 = ym(2010,  6)
local k6 = ym(2014, 10)

mkspline tmspline1 `k1' tmspline2 `k2' tmspline3 `k3' tmspline4 `k4' ///
	tmspline5 `k5' tmspline6 `k6' tmspline7 = tm, displayknots

* Flag respondents who are married with a spouse present
gen byte spouse_present = (marstat == 1)

* Bring in each respondent's own-child counts; respondents with no match in the
* parent-level file have no linked children, so their counts become zero
* (youngest_years is left missing for them)
merge m:1 hid pernum tm using `ownkids', keep(master match) nogenerate
foreach kidvar of varlist ownkids ownkids_age* {
	replace `kidvar' = 0 if missing(`kidvar')
}

* Household structure: 1/2 = no spouse without/with children, 3/4 = spouse
* present without/with children. spouse_present is 0/1 and ownkids is a
* non-missing count >= 0 at this point, so the sum below spans exactly 1-4.
gen byte hhstatus = 1 + 2 * spouse_present + (ownkids > 0)

label define hhstatus_lbl 1 "No spouse present, no children present" 2 "No spouse present, parent" 3 "Spouse present, no children present" 4 "Spouse present, parent", replace
label values hhstatus hhstatus_lbl

* Bin the age of the youngest own child; 0 marks respondents with no children
gen byte youngest = 0 if ownkids == 0
local bin_lo "0 6 13"
local bin_hi "5 12 17"
forvalues g = 1/3 {
	local lo : word `g' of `bin_lo'
	local hi : word `g' of `bin_hi'
	replace youngest = `g' if inrange(youngest_years, `lo', `hi')
}
assert !missing(youngest)

label define youngest_lbl 0 "No child < 18" 1 "Youngest < 6" 2 "Youngest 6-12" 3 "Youngest 13-17", replace
label values youngest youngest_lbl

* Flag respondents whose industry code is in the education-sector list and
* respondents whose occupation code falls in the teaching range
gen byte school  = inlist(ind1990, 842, 850, 851, 852, 860)
gen byte teacher = (occ1990 >= 113 & occ1990 <= 163)

* Attach the number of weeks since the previous month's reference week; every
* respondent-month must find a match in the calendar file
merge m:1 tm using `weeks', assert(using match) keep(match) keepusing(weeks) nogenerate


* Build weekly earnings for 1994 onward (when paid/unpaid absence is observed)
*-------------------------------------------------------------------------------

* Weekly earnings must be empty outside month-in-sample 4 and 8, and also for
* the non-employed and the self-employed within those months from 1994 on
assert missing(earnweek) if mish != 4 & mish != 8
assert missing(earnweek) if inlist(mish, 4, 8) & year >= 1994 & (emp == 0 | selfemp == 1)

* Observations entering the earnings analyses
gen byte earn_sample = inlist(mish, 4, 8) & year >= 1994

* Report how often wage/salary workers lack weekly earnings or an hourly wage
count if earn_sample == 1 & missing(earnweek) & emp == 1 & selfemp == 0
count if earn_sample == 1 & missing(hourwage) & emp == 1 & selfemp == 0 & hourly == 1

* Conditions shared by the assignments below
local worker  "earn_sample == 1 & emp == 1"
local at_work "hours > 0"
local away    "hours == 0"

* The non-employed earn nothing; everyone else starts out missing
gen earnings = 0 if earn_sample == 1 & emp == 0

* At work, salaried: reported weekly earnings
replace earnings = earnweek if `worker' & `at_work' & hourly == 0

* At work, hourly: wage x actual hours, or weekly earnings when the wage is missing
replace earnings = hourwage * hours if `worker' & `at_work' & hourly == 1 & !missing(hourwage)
replace earnings = earnweek if `worker' & `at_work' & hourly == 1 & missing(hourwage)

* Absent all week (either pay type): weekly earnings if the absence is paid,
* zero if it is unpaid
replace earnings = earnweek if `worker' & `away' & inlist(hourly, 0, 1) & paid_absence == 1
replace earnings = 0 if `worker' & `away' & inlist(hourly, 0, 1) & paid_absence == 0


* Label, arrange, and save the extract
*-------------------------------------------------------------------------------

* Variable labels: calendar and timing
label variable year "Calendar year"
label variable month "Calendar month"
label variable weeks "Number of weeks elapsed since previous month's reference week"
forvalues k = 1/7 {
	label variable tmspline`k' "Linear spline in calendar time"
}

* Variable labels: demographics and household structure
label variable age_bin "Age (5-year bins)"
label variable spouse_present "Married with the spouse present"
label variable hhstatus "Household structure"
label variable ownkids "Number of own children under 18 present in household"
label variable youngest_years "Age of youngest own child under 18 in the household"
label variable youngest "Binned age of youngest own child in the household"
forvalues a = 0/17 {
	label variable ownkids_age`a' "Number of own children age `a' in household"
}

* Variable labels: work and earnings
label variable lfp "Labor force participant (== 1 - nlf)"
label variable hours "Actual hours worked last week (= 0 if not at work)"
label variable school "Industry is educational services"
label variable teacher "Occupation is teaching"
label variable earn_sample "Included in earnings analyses"
label variable earnings "Estimated earnings"

* Arrange the columns by theme
order ///
	pid hid tm year month pernum mish wtfinl ///
	marbasecidp tmspline* weeks ///
	linked_monthly linked_complete ///
	female age age_bin wbho educ state_fips school_status ///
	marstat spouse_present hhstatus sploc ///
	ownkids ownkids_age* youngest youngest_years ///
	emp unemp nlf lfp ftemp ptemp absent empstat hours ///
	occ1990 occ1990_2d ind1990 school teacher ///
	whyunemp whynilf whyabsnt paid_absence ///
	earnweek earnwt earn_sample earnings ///
	selfemp hourly hourwage uhrsworkt

* Shrink storage types and sort by person and month
compress
sort pid tm

* First output: every age retained
gzsave "$basepath/data/derived/cps_bms_sample_allages.dta.gz", replace

* Second output: the main estimation sample, restricted to ages 25-49
drop if !inrange(age, 25, 49)
gzsave "$basepath/data/derived/cps_bms_sample.dta.gz", replace

* Close the log file
unlaunch
